Internet Publisher's Toolbox 2.0

home *** CD-ROM | disk | FTP | other *** search

/ Internet Publisher's Toolbox 2.0 / Internet Publisher's Toolbox.iso / internet / ntserver / wtsource / irfiles.c < prev next >

Wrap

C/C++ Source or Header | 1994-11-14 | 67.1 KB | 2,240 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* This file defines the files of an inverted file index. * * This structure is designed to be flexible rather than particularly * optimized for speed or space. * Thus this organization can support: * boolean, proximity, weights, and relevance feedback. * * Ported directly from the Lisp version 1.2 of the search engine. * * -brewster 6/90 */ #ifndef lint static char *RCSid = "$Header: /usr/local/ls63/pfeifer/freeWAIS-0.1-s/ir/RCS/irfiles.c,v 1.4 1993/07/13 19:22:33 huynh1 Exp $"; #endif /* ==================== */ /* === Change Log === */ /*Created 12/4/89 Brewster full lisp version *split from ir-engine 1/11/90 brewster * *added memory indexing for efficiency *added variable index block sizes *5/90 ported to C *5/90 split from irbuild.c *7/90 declared truename() a static function - HWM *7/90 changed filename table and headline table to be null * terminated in the file rather than \newline. * compatibility problems between systems (sigh). * -brewster *7/90 added field to document table for WAIStation * -brewster *7/90 fixed: BUG: when adding words to the word disk hashtable, watch out * for the end of the file and wrap. If it is full, error out. *3/91 took out utilities and created futil.c -brewster *3/91 took out the inverted file and created irinv.c -brewster * * $Log: irfiles.c,v $ * Revision 1.5 1993/07/01 19:34:50 warnock * explicit declaration of gSavepart in savePartMatch * * Revision 1.4 93/07/01 19:18:54 warnock * gethostname -> mygethostname * * Revision 1.3 1993/02/16 17:07:49 freewais * added AT&T patches for keyword list * * Revision 1.2 1993/02/16 15:32:56 freewais * changed directory of servers registration to cnidr.org * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.63 92/04/28 16:54:41 morris * added boolean support * * Revision 1.62 92/03/20 13:57:04 jonathan * New and Improved server registration. * * Revision 1.61 92/03/19 10:38:27 shen * modified lock to prevent more than one indexing at the same time. * modified lock to block query while initilaizing a database * * Revision 1.60 92/03/19 09:33:35 morris * fixed the dictionary header to accurately indicate the number of blocks * * Revision 1.59 92/02/27 12:25:27 shen * add in locks * * Revision 1.58 92/02/25 16:42:28 jonathan * Added find_pointer_in_block using binary search from * ses@techunix.technion.ac.il. (part of wais-8-b3-ses). * * * Revision 1.57 92/02/25 12:49:16 jonathan * removed a bunch of \n's from waislog's. * * Revision 1.56 92/02/17 16:23:58 jonathan * Modified build_catalog so it passes over the first entry (which seems to be * empty). * * Revision 1.55 92/02/17 12:37:34 jonathan * Added code to build a catalog containing all headlines and DocID's for * documents in the database. * * Revision 1.54 92/02/16 09:50:49 jonathan * plugged some memory leaks. I bet there are more. * * Revision 1.53 92/02/16 09:26:39 jonathan * ask harry. * * Revision 1.52 92/02/12 13:25:12 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* ==================== */ /* ==================== */ /* To Do list * * Implement a filename hashtable so that we can test quickly when * a file has been indexed. * Free up all memory when we can. * Implement logrithmic merging * * change DOC_TAB_ENTRY_FILENAME_ID_SIZE to 4 This must be in version 9 * change DOC_TAB_ENTRY_HEADLINE_ID_SIZE to 4 This must be in version 9 * change DOC_TAB_ENTRY_NUM_LINES_SIZE to 4 This must be in version 9 * change MAX_WORD_LENGTH to 15 This must be in version 9 */ /* A specification for this is called ir-engine.text in microsoft word. */ #include <string.h> /* for memset() */ #include "cutil.h" #include "irfiles.h" #include "panic.h" #include "ustubs.h" /* for strstr */ #include "futil.h" #include "sockets.h" #include "version.h" #include "irext.h" #include "irlex.h" /* for MAX_WORD_LENGTH */ #include "lock.h" #ifdef WIN32 boolean SetInterlock(char *,boolean,boolean); void ResetInterlock(void); int CloseRenameOpen(FILE**,char*,char*,char*); void InitSockets(void); void TermSockets(void); #endif extern char* keyword[50]; extern short nKeys; char *descript[1000]; short nDesLines = 0; #define PRINT_AS_INDEXING false /* also defined in irtfiles.c and irhash.c */ /* ------------------------------- */ #define DOC_TAB_HEADER_SIZE 2 #define DOC_TAB_MAXIMUM_ENTRIES 8192 #define DOC_TAB_ENTRY_FILENAME_ID_SIZE 3 #define DOC_TAB_ENTRY_START_CHAR_SIZE 4 #define DOC_TAB_ENTRY_END_CHAR_SIZE 4 #define DOC_TAB_ENTRY_HEADLINE_ID_SIZE 3 #define DOC_TAB_ENTRY_DOC_LENGTH_SIZE 4 #define DOC_TAB_ENTRY_NUM_LINES_SIZE 3 #define DOC_TAB_ENTRY_DATE_SIZE 4 #define DOC_TAB_ELEMENT_SIZE 25 /* sum of above sizes */ #define DICTIONARY_ENTRY_SIZE 29 /* sum of MAX_WORD_LENGTH, 1 ('\0'), NEXT_INDEX_BLOCK_SIZE and NUMBER_OF_OCCURANCES_SIZE */ #define FILENAME_TABLE_HEADER_SIZE 4 #define HEADLINE_TABLE_HEADER_SIZE 4 #ifdef BIO #define DELIMITERS_SIZE 4 #endif #define FILE_WRITE_DATE_SIZE 4 #define NUMBER_OF_OCCURANCES_SIZE 4 #define DOCUMENT_SCORE_LIMIT_SIZE 1 #define DOCUMENT_SCORE_LIMIT 255 /* this is computed from DOCUMENT_SCORE_LIMIT_SIZE */ #define TIME_WAIT_QUERY_END 5 #define TIMEOUT_WAIT_QUERY_END 45 static char* temp_dictionary_filename _AP((char* destination, database* db)); static long current_lock_type = INVALID_LOCK; /*============================ === Database support === ============================*/ /* looks up the total word count in an existing dictionary. */ boolean look_up_total_word_count _AP((database *db)); boolean look_up_total_word_count(db) database *db; { long word_count; long answer = look_up_word_in_dictionary(DICTIONARY_TOTAL_SIZE_WORD, &word_count, db); if(answer == 0){ waislog(WLOG_HIGH, WLOG_ERROR, "error finding total_word_count in dictionary %s\n", db->database_file); disposeDatabase(db); return(false); } else if(answer < 0){ waislog(WLOG_HIGH, WLOG_ERROR,"total_word_count not found in dictionary\n.This is either an error,or the database is old."); db->total_word_count = word_count; } else{ db->total_word_count = word_count; } /* printf("Total Words in DB: %ld\n", db->total_word_count); */ return(true); } database* openDatabase(name,initialize,for_search) char* name; boolean initialize; boolean for_search; { /* open a database (open all its files), and return an opaque object. return NULL if there is an error */ #ifndef WIN32 unsigned long pid; long timeout; #endif char file[MAX_FILE_NAME_LEN + 1 ]; char tmpfile[MAX_FILE_NAME_LEN + 1]; char open_mode[4]; database* db = (database*)s_malloc((size_t)sizeof(database)); if (db == NULL){ waislog(WLOG_HIGH, WLOG_ERROR, "can't make a database, out of memory.\n"); return(NULL); } db->total_word_count = 0; if (for_search == true) strncpy(open_mode,"rb",3); /* read only for searching */ else strncpy(open_mode,"r+b",4); /* read/write for building */ /* set the query parameter to the original name */ { query_parameter_type parameters; char **list; list=(char **)s_malloc(2*sizeof(char*)); list[0]=s_strdup(name); list[1]=NULL; parameters.srcs = list; set_query_parameter(SET_SELECT_SOURCE,¶meters); } /* ask the backend where the database lives, but put in the directory information that we already have. This changes the 'name' variable. */ db->database_file = s_strdup(merge_pathnames(database_file(pathname_name(name)), pathname_directory(name, tmpfile))); #ifdef WIN32 if ((boolean)SetInterlock(db->database_file,initialize,for_search)==false) return NULL; #else if (for_search == true) { /* check and set appropriate locks */ if( utlk_using_lock(db->database_file, LOCK_UPDATE) ) { waislog(WLOG_HIGH, WLOG_ERROR, "can't search the database as an update is currently running"); return(NULL); } if ( utlk_set_lock(db->database_file, LOCK_QUERY) ) current_lock_type = LOCK_QUERY; else waislog(WLOG_LOW, WLOG_INFO, "query lock can't be set"); } else { if( utlk_using_lock_and_get_pid(db->database_file, LOCK_INDEX, &pid) && (pid != getpid()) ) { waislog(WLOG_HIGH, WLOG_ERROR, "an indexing is currently running on the database. Try again later."); return(NULL); } if ( utlk_set_lock(db->database_file, LOCK_INDEX) ) current_lock_type = LOCK_INDEX; else waislog(WLOG_LOW, WLOG_INFO, "index lock can't be set"); if ( initialize == true ) { /* wait for current query finishing off */ timeout = 0; while ( utlk_using_lock(db->database_file, LOCK_QUERY) ) { if ( timeout >= TIMEOUT_WAIT_QUERY_END ) { waislog(WLOG_HIGH, WLOG_ERROR, "timed out in waiting for a query to finish. Try again later."); utlk_unset_lock(db->database_file, LOCK_INDEX); return(NULL); } waislog(WLOG_LOW, WLOG_INFO, "waiting for a query to finish to initialize the database..."); sleep(TIME_WAIT_QUERY_END); timeout += TIME_WAIT_QUERY_END; } if ( utlk_set_lock(db->database_file, LOCK_UPDATE) ) current_lock_type = LOCK_UPDATE; else waislog(WLOG_LOW, WLOG_INFO, "update lock can't be set"); } } #endif /* WIN32 */ if(initialize == true){ initialize_index_files(db); } else { db->dictionary_stream = s_fopen(dictionary_filename(file, db),open_mode); if (db->dictionary_stream == NULL){ waislog(WLOG_HIGH,WLOG_ERROR,"can't open the word hash file %s\n",file); disposeDatabase(db); return(NULL); } /* find the total_word_count from the dictionary */ if(for_search){ if(false == look_up_total_word_count(db)) { /* side effects db */ disposeDatabase(db); return(NULL); } } db->filename_table_stream = s_fopen(filename_table_filename(file, db),open_mode); if (db->filename_table_stream == NULL){ waislog(WLOG_HIGH, WLOG_ERROR, "can't open the filename file %s", file); disposeDatabase(db); return(NULL); } db->headline_table_stream = s_fopen(headline_table_filename(file, db),open_mode); if (db->headline_table_stream == NULL){ waislog(WLOG_HIGH, WLOG_ERROR, "can't open the headline file %s", file); disposeDatabase(db); return(NULL); } #ifdef BIO db->delimiters_stream = s_fopen(delimiters_filename(file, db),open_mode); if (db->delimiters_stream == NULL){ waislog(WLOG_HIGH, WLOG_ERROR, "can't open the delimiters file %s, using defaults", file); /* disposeDatabase(db); */ /* return(NULL); */ } #endif db->document_table_stream = s_fopen(document_table_filename(file, db),open_mode); if (db->document_table_stream == NULL){ waislog(WLOG_HIGH, WLOG_ERROR, "can't open the document id file %s", file); disposeDatabase(db); return(NULL); } /* initialize the allocated entries variable */ s_fseek(db->document_table_stream, 0L, SEEK_END); db->doc_table_allocated_entries = (ftell(db->document_table_stream) - DOC_TAB_HEADER_SIZE) / DOC_TAB_ELEMENT_SIZE; } db->index_file_number = 0; ext_open_database(db,initialize,for_search); return(db); } void closeDatabase(db) database* db; /* close a database and all its files. Do not dispose of the structure. */ { if (db == NULL) return; close_dictionary_file(db); if (db->dictionary_stream != NULL) s_fclose(db->dictionary_stream); if (db->filename_table_stream != NULL) s_fclose(db->filename_table_stream); if (db->headline_table_stream != NULL) s_fclose(db->headline_table_stream); if (db->document_table_stream != NULL) s_fclose(db->document_table_stream); if (db->index_stream != NULL) s_fclose(db->index_stream); ext_close_database(db); #ifdef WIN32 ResetInterlock(); #else utlk_unset_lock(db->database_file, current_lock_type); if ( current_lock_type == LOCK_UPDATE) utlk_unset_lock(db->database_file, LOCK_INDEX); current_lock_type = INVALID_LOCK; #endif /* WIN32 */ } void disposeDatabase(db) database* db; { closeDatabase(db); s_free(db->database_file); s_free(db); } /* ==================================== */ /* === Initialization of the files === */ /* ==================================== */ #define BLOCK_SIZE 16384 /* size of blocks of zeros to write to a file */ static FILE* initialize_file _AP((long size,char* filename,boolean zero_it)); static FILE* initialize_file(size,filename,zero_it) long size; char* filename; boolean zero_it; /* initializes a file by opening a new stream, making it the right * size and returning the stream. */ { FILE* file = NULL; long i; #ifdef ANSI_LIKE remove(filename); #endif file = s_fopen(filename, "wb"); if(NULL == file){ panic("The file %s could not be opened\n", filename); } if(zero_it){ if(size >= BLOCK_SIZE){ /* then write big blocks of zeros */ char* zeros = NULL; zeros = (char*)s_malloc((size_t)BLOCK_SIZE); if(NULL == zeros){ panic("Could not allocate a large block of Zeros\n"); } memset(zeros, 0, BLOCK_SIZE); while(size >= BLOCK_SIZE){ /* then write big blocks of zeros */ if(BLOCK_SIZE != fwrite(zeros, 1, BLOCK_SIZE, file)) panic("Write failed"); size = size - BLOCK_SIZE; } s_free(zeros); } for(i = 0; i < size; i++){ /* clean up the rest */ putc('\0', file); } } else{ /* dont zero it */ grow_file(file, size); } #ifdef THINK_C /* set the mac file type to INDX */ setFileType(filename, WAIS_INDEX_FILE_TYPE, CREATOR); #endif /* THINK_C */ s_fclose(file); file = s_fopen(filename, "r+b"); /* open it in read/write */ if(NULL == file){ panic("Error in initialization, can not reopen %s.\n", filename); } return(file); } void initialize_index_files (db) database* db; /* This creates new index files, deleting any old ones. */ { char file[MAX_FILENAME_LEN]; /* cprintf(PRINT_AS_INDEXING, "initializing index files: %s\n", db->database_file); */ remove(dictionary_filename(file, db)); /* remove the old one */ db->index_stream = NULL; db->doc_table_allocated_entries = 1; /* the 0th is the null pointer */ db->document_table_stream = initialize_file((DOC_TAB_HEADER_SIZE + DOC_TAB_ELEMENT_SIZE), document_table_filename(file, db), TRUE); db->filename_table_stream = initialize_file(FILENAME_TABLE_HEADER_SIZE, filename_table_filename(file, db), TRUE); db->headline_table_stream = initialize_file(HEADLINE_TABLE_HEADER_SIZE, headline_table_filename(file, db), TRUE); #ifdef BIO db->delimiters_stream = initialize_file(DELIMITERS_SIZE, delimiters_filename(file, db), TRUE); #endif } /* ========================= */ /* === Dictionary File === */ /* ========================= */ /* The dictionary file is a 1 deep tree of blocks. The header of the file says how long the header block is. The "header block" is a set of pointers to the heads of the blocks in the dictionary. A dictionary block is a list of word and pointer pairs. The words are padded to a fixed length so that it is a fixed length record. The pointers are pointers into the inverted file (except in the header block where they are pointers into the dictionary file). */ /* SEARCHING DICTIONARY FILES */ /* top level function: long look_up_word_in_dictionary(char *word, long *word_id, database* db) */ unsigned char *dictionary_header_block = NULL; /* the dictionary header. loaded once */ long number_of_dictionary_blocks = 0; /* also the length of the dictionary header block */ unsigned char *dictionary_block = NULL; /* this is one of the dict blocks */ int dictionary_last_word_occurances; /* This is a temporary hack so I can separate out the relevance feedback changes for posting. DON'T USE THIS ANYWHERE - IT'LL BE GONE SOON */ void close_dictionary_file(db) database *db; { if(dictionary_header_block) s_free(dictionary_header_block); dictionary_header_block = NULL; } static long fread_from_stream _AP((FILE* stream,unsigned char* buf, long nbytes)); static long fread_from_stream(stream,buf,nbytes) FILE *stream; unsigned char *buf; long nbytes; /* this is a safe version of unix 'fread' it does all the checking * and looping necessary */ { long didRead; long toRead = nbytes; long totalRead = 0; /* paranoia */ /*printf("in Fread_from_stream buffer %ld, nbytes %ld\n", (long)buf, nbytes); */ while (toRead > 0){ didRead = fread(buf, sizeof(char), toRead, stream); if(didRead == -1) /* error*/ return(-1); if(didRead == 0) /* eof */ return(-2); /* maybe this should return 0? */ toRead -= didRead; buf += didRead; totalRead += didRead; } if(totalRead != nbytes) /* we overread for some reason */ return(- totalRead); /* bad news */ return(totalRead); } #ifdef DICT_FUNC char *dictionary_block_word(i,block) long i; unsigned char *block; /* returns the word field in the ith dictionary block entry */ { return((char *)(block + (i * DICTIONARY_ENTRY_SIZE))); } long dictionary_block_position(i,block) long i; unsigned char *block; /* returns the position field in the ith dictionary block entry */ { /* printf("dictionary_block_position %ld\n", read_bytes_from_memory (NEXT_INDEX_BLOCK_SIZE, block + (i * DICTIONARY_ENTRY_SIZE) + MAX_WORD_LENGTH + 1)); */ return(read_bytes_from_memory (NEXT_INDEX_BLOCK_SIZE, block + (i * DICTIONARY_ENTRY_SIZE) + MAX_WORD_LENGTH + 1)); } long dictionary_block_word_occurances(i,block) long i; unsigned char *block; /* returns the occurances field in the ith dictionary block entry */ { return(read_bytes_from_memory (NEXT_INDEX_BLOCK_SIZE, block + (i * DICTIONARY_ENTRY_SIZE) + MAX_WORD_LENGTH + 1 + NEXT_INDEX_BLOCK_SIZE)); } #endif #ifdef PARTIALWORD typedef struct { long blocknum, wordcount; } saveparttype; static long gMaxpart = 0; static long gNpart = 0; static long gAtpart = 0; static saveparttype *gSavepart = NULL; void clearPartMatch() { if (gSavepart!=NULL) free(gSavepart); gSavepart= NULL; gMaxpart= 0; gNpart= 0; gAtpart= 0; } void savePartMatch( blocknum, wordcount) long blocknum, wordcount; { if (gNpart>=gMaxpart) { gMaxpart= gNpart + 100; if (gSavepart==NULL) /* (saveparttype*) */ gSavepart= (saveparttype*)malloc(gMaxpart*sizeof(saveparttype)); else /* (saveparttype*) */ gSavepart= (saveparttype*)realloc(gSavepart, gMaxpart*sizeof(saveparttype)); } gSavepart[gNpart].blocknum= blocknum; gSavepart[gNpart].wordcount= wordcount; gNpart++; } #endif static long find_pointer_in_block _AP((char* word,unsigned char* block, long block_length, long *position, boolean findpart )); /* Courtesy of Simon Spero <ses@techunix.technion.ac.il> */ static long find_pointer_in_block(word,block,block_length, position, findpart) char *word; unsigned char *block; long block_length; /* in entries */ long *position; boolean findpart; /* dgg, partial word match */ /* returns 0 if an error or if the word is below the lowest block, (this confusion between error and NULL is bad, but found late in the design process) it returns the positive position if the word is there exactly, and the negative of the position of the word before it if the word is not there exactly. position is set with the entry postion in the block that the word was found. This is used for searching. */ { /* find the entry in the dictionary header for this word. returns 0 if not found. */ /* this could be binary search XXX */ #ifdef WIN32 long i,high,low; #else long i,high,low,tmp; #endif #ifdef PARTIALWORD long wordlen= strlen(word); #endif low = 0; high = block_length; i = (low+high)/2; while(low != high) { long compare; char *dictionary_word = dictionary_block_word(i, block); /* printf("dw = %s, w = %s, low = %d, i = %d, hi = %d\n", dictionary_word,word,low,i,high); */ if(dictionary_word[0] == '\0') { if(high != i) { high = i; i = (low+i)/2; } else { *position = i-1; return(- dictionary_block_position(i-1,block)); } } else { #ifdef PARTIALWORD if (findpart) { compare = strncmp(dictionary_word, word, wordlen); if ((0 == compare) ) { int ati = i; /* save partword matches for later... */ savePartMatch( dictionary_block_position(i, block), dictionary_block_word_occurances(i,block)); while (i>0 && 0 == compare) { --i; dictionary_word = dictionary_block_word(i, block); compare = strncmp(dictionary_word, word, wordlen); if (0 == compare) savePartMatch( dictionary_block_position(i, block), dictionary_block_word_occurances(i,block)); } i= ati; /* Could be compare = 0; while (i<block_length && 0 == compare) { */ while (i<high && 0 == compare) { ++i; dictionary_word = dictionary_block_word(i, block); compare = strncmp(dictionary_word, word, wordlen); if (0 == compare) savePartMatch( dictionary_block_position(i, block), dictionary_block_word_occurances(i,block)); } *position = ati; return(-dictionary_block_position(ati, block)); } } else compare = strcmp(dictionary_word, word); #else compare = strcmp(dictionary_word, word); #endif if(0 == compare) { dictionary_last_word_occurances = dictionary_block_word_occurances(i,block); *position = i; return(dictionary_block_position(i, block)); } if(compare > 0){ if(high != i) { high = i; i = (low+i)/2; } else { *position = i-1; return(- dictionary_block_position(i-1 , block)); } } else { if (low != i) { low = i; #ifdef WIN32 i = (long)((0.5+high+i)/2); #else i = (0.5+high+i)/2; #endif } else { *position = i; return(- dictionary_block_position(i , block)); } } } } if(i == 0) { *position = 0; return(0); } else { *position = i-1; return(- dictionary_block_position(i - 1, block)); } } unsigned char *read_dictionary_block(block,position,length,stream) unsigned char *block; long position; long length; FILE *stream; /* reads the dictionary block from the disk and returns it. block is the place to put it, if it is NULL, then it is malloc'ed. position is the position in the dictionary file to start reading. length is th enumber of entries (not bytes) in the block. stream is the dictionary stream. it returns NULL if it loses. */ { static long last_position = -1; static unsigned char* last_block = NULL; static FILE* last_dict_file = NULL; /* there may be more than one dict */ if (stream != last_dict_file) { /* invalidate the cache */ last_position = -1; last_dict_file = stream; } if(NULL == block) block = (unsigned char *)s_malloc((size_t)(length*DICTIONARY_ENTRY_SIZE)); if ((block != last_block) || (position != last_position)) { last_position = position; last_block = block; s_fseek(stream, position, SEEK_SET); if(0 > fread_from_stream(stream, block, (length * DICTIONARY_ENTRY_SIZE))){ waislog(WLOG_HIGH, WLOG_ERROR, "Could not read the dictionary block %ld, length %ld", block, length); return(NULL); } } return(block); } #ifdef PARTIALWORD long look_up_partialword_in_dictionary(word, number_of_occurances, db) char *word; long *number_of_occurances; database* db; { long answer; boolean findpart = false; if (word != NULL) { clearPartMatch(); answer= look_up_word_in_dictionary( word, number_of_occurances, db); if (answer > 0) return (answer); /* got a match */ } if (gAtpart >= gNpart) { clearPartMatch(); return(-1); } else { answer= gSavepart[gAtpart].blocknum; if (NULL != number_of_occurances) { if (answer > 0) *number_of_occurances = gSavepart[gAtpart].wordcount; else *number_of_occurances = 0; } gAtpart++; return( answer); } } #endif long look_up_word_in_dictionary(word, number_of_occurances, db) char *word; long *number_of_occurances; database* db; /* looks up the word in the dictionary file. Returns the pointer into the inverted file or negative number if not found, or 0 if error. It sets number_of_occurances (if it is not NULL) to the number registered in the file. This is used during searching. It is set to 0 if error or word not found. If it is NULL, then it is not touched. */ { long position; long answer; FILE *stream = db->dictionary_stream; long dictionary_block_pos; boolean findpart = false; /* dgg, PARTIALWORD flag */ #ifdef PARTIALWORD { int l = strlen(word) - 1; if (l > 0 && word[l] == PARTWORD_WILDCARD) { findpart= true; word[l]= '\0'; } } #endif if(NULL == dictionary_header_block) { s_fseek(stream, 0L, SEEK_SET); number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE,stream); dictionary_header_block = read_dictionary_block(dictionary_header_block,DICTIONARY_HEADER_SIZE, number_of_dictionary_blocks,stream); if(NULL == dictionary_header_block) { waislog(WLOG_HIGH, WLOG_ERROR, "Could not read dictionary header block in db %s.", db->database_file); return(0); } } dictionary_block_pos = find_pointer_in_block(word, dictionary_header_block, number_of_dictionary_blocks, &position, false); if(0 == dictionary_block_pos) { /* waislog(WLOG_HIGH, WLOG_ERROR, "Could not find pointer for word '%s' (location %ld) in block in db %s!", word, word, db->database_file); */ return(-1); /* not an error, necessarily if the word is before the first entry */ } dictionary_block = read_dictionary_block(dictionary_block,ABS(dictionary_block_pos), DICTIONARY_BLOCK_SIZE,stream); if(NULL == dictionary_block) { waislog(WLOG_HIGH, WLOG_ERROR, "Could not read dictionary block %ld in db %s", ABS(dictionary_block_pos), db->database_file); return(0); } answer = find_pointer_in_block(word, dictionary_block, DICTIONARY_BLOCK_SIZE, &position, findpart); if((NULL != number_of_occurances)) { if (answer > 0) *number_of_occurances = dictionary_block_word_occurances(position, dictionary_block); else *number_of_occurances = 0; } return(answer); } /* BUILDING DICTIONARY FILES */ long number_of_dictionary_entries; /* number allocated */ char *block_of_zeros = NULL; static void write_zeros_to_stream _AP((long n_bytes,FILE* stream)); static void write_zeros_to_stream(n_bytes,stream) long n_bytes; FILE *stream; /* writes zeros to a file quickly */ { long i; if(n_bytes >= BLOCK_SIZE){ /* then write big blocks of zeros */ if(NULL == block_of_zeros){ block_of_zeros = (char*)s_malloc((size_t)BLOCK_SIZE); memset(block_of_zeros, 0, BLOCK_SIZE); } while(n_bytes >= BLOCK_SIZE){ /* then write big blocks of zeros */ if(BLOCK_SIZE != fwrite(block_of_zeros, sizeof(char), BLOCK_SIZE, stream)) panic("Write failed"); n_bytes -= BLOCK_SIZE; } } for(i = 0; i < n_bytes; i++){ /* clean up the rest */ putc('\0', stream); } } /* returns 0 if successful */ long init_dict_file_for_writing(db) database *db; { char filename[MAX_FILENAME_LEN]; if (db->dictionary_stream != NULL) fclose(db->dictionary_stream); db->dictionary_stream = s_fopen(temp_dictionary_filename(filename, db), "w+b"); db->total_word_count = 0; init_dict_file_detailed(db->dictionary_stream,db->number_of_words); return(0); } static long dict_number_of_blocks _AP((long number_of_words)); static long dict_number_of_blocks(number_of_words) long number_of_words; { long number_of_blocks; number_of_blocks = (number_of_words / DICTIONARY_BLOCK_SIZE) + ((0 == (number_of_words % DICTIONARY_BLOCK_SIZE)) ? 0 : 1); return(number_of_blocks); } void record_num_blocks_in_dict(dictionary_stream,number_of_words) FILE* dictionary_stream; long number_of_words; { /* write the number of blocks */ s_fseek(dictionary_stream, 0L, SEEK_SET); write_bytes(dict_number_of_blocks(number_of_words), DICTIONARY_HEADER_SIZE, dictionary_stream); fseek(dictionary_stream, 0L, SEEK_END); } void init_dict_file_detailed(dictionary_stream,number_of_words) FILE* dictionary_stream; long number_of_words; { /* create space for the table in the front of the file */ write_zeros_to_stream(DICTIONARY_HEADER_SIZE + (DICTIONARY_ENTRY_SIZE * dict_number_of_blocks(number_of_words)), dictionary_stream); record_num_blocks_in_dict(dictionary_stream,number_of_words); number_of_dictionary_entries = 0; } /* this must be called in alphabetical order, and writes the word to the dictionary file. */ long add_word_to_dictionary(word,position,number_of_occurances,db) char *word; long position; long number_of_occurances; database *db; /* Puts a word into the dictionary file. */ { /* assumes the stream has been initialized, and it is positioned at the end */ FILE *stream = db->dictionary_stream; char padded_word[MAX_WORD_LENGTH + 1]; memset(padded_word, 0, MAX_WORD_LENGTH + 1); /* clear the word */ strcpy(padded_word, word); if(0 == (number_of_dictionary_entries % DICTIONARY_BLOCK_SIZE)){ /* then add an entry in the header */ long original_position = s_ftell(stream); long header_entry = number_of_dictionary_entries / DICTIONARY_BLOCK_SIZE; /* printf("Adding header entry %ld %s original pos %ld\n", header_entry, padded_word, original_position); */ fseek(stream, DICTIONARY_HEADER_SIZE + (header_entry * DICTIONARY_ENTRY_SIZE), SEEK_SET); if((MAX_WORD_LENGTH + 1) != fwrite(padded_word, sizeof(char), MAX_WORD_LENGTH + 1, stream)) panic("Write failed"); write_bytes(original_position, NEXT_INDEX_BLOCK_SIZE, stream); write_bytes(0L, NUMBER_OF_OCCURANCES_SIZE, stream); fseek(stream, original_position, SEEK_SET); /* go back to the end */ /* zero the next block */ write_zeros_to_stream(DICTIONARY_ENTRY_SIZE * DICTIONARY_BLOCK_SIZE, stream); fseek(stream, original_position, SEEK_SET); } /* write the word */ if((MAX_WORD_LENGTH + 1) != fwrite(padded_word, sizeof(char), MAX_WORD_LENGTH + 1, stream)) panic("Write failed"); write_bytes(position, NEXT_INDEX_BLOCK_SIZE, stream); write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, stream); number_of_dictionary_entries++; db->total_word_count += number_of_occurances; return(0); } /* this is called after all add_words are done, but before the file is closed. Returns 0 if successful. */ long finished_add_word_to_dictionary(db) database* db; { char temp_filename[MAX_FILENAME_LEN]; char filename[MAX_FILENAME_LEN]; waislog(WLOG_LOW, WLOG_INFO, "Total word count for dictionary is: %ld", db->total_word_count); if(0 != add_word_to_dictionary(DICTIONARY_TOTAL_SIZE_WORD, 1, db->total_word_count, db)) return(-1); record_num_blocks_in_dict(db->dictionary_stream,db->number_of_words); fflush(db->dictionary_stream); /* so that any new opens will see a valid file */ /* rename the .dcttmp file to dct */ temp_dictionary_filename(temp_filename, db); dictionary_filename(filename, db); /* printf("renaming %s to %s\n", temp_filename, filename); */ #ifdef WIN32 if (0!=(int)CloseRenameOpen(&(db->dictionary_stream),temp_filename,filename,"r+b")) waislog(WLOG_HIGH, WLOG_ERROR, "could not rename file %s to %s", temp_filename, filename); #else if(0 != rename(temp_filename, filename)) waislog(WLOG_HIGH, WLOG_ERROR, "could not rename file %s to %s", temp_filename, filename); #endif /* WIN32 */ return(0); } void print_dictionary_block(block,size) unsigned char *block; long size; /* this prints the contents of a dictionary block */ { long i; for(i = 0; i < size; i++){ char *word = dictionary_block_word(i, block); if(word[0] == '\0') break; /* I assume this is only for debugging - JG */ printf("Entry %3ld: %21s %7ld %7ld\n", i, word, dictionary_block_position(i, block), dictionary_block_word_occurances(i, block)); } } void print_dictionary _AP((database* db)); void print_dictionary(db) database *db; { /* prints the contents of a dictionary */ FILE *stream = db->dictionary_stream; long i; long new_number_of_dictionary_blocks; if(NULL == stream) panic("dictionary stream is not open"); s_fseek(stream, 0L, SEEK_SET); new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, stream); if(new_number_of_dictionary_blocks > number_of_dictionary_blocks) dictionary_header_block = NULL; number_of_dictionary_blocks = new_number_of_dictionary_blocks; printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks); if(NULL == (dictionary_header_block = read_dictionary_block(dictionary_header_block, DICTIONARY_HEADER_SIZE, number_of_dictionary_blocks, stream))) panic("Could not read dictionary header block"); printf("The Dictionary Header Block:\n"); print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks); for(i = 0; i < number_of_dictionary_blocks; i++){ long pos = dictionary_block_position(i, dictionary_header_block); if(NULL == (dictionary_block = read_dictionary_block(dictionary_block, pos, DICTIONARY_BLOCK_SIZE, stream))) panic("Could not read dictionary block %ld", pos); printf("\n\nDictionary block %ld (position %ld):\n", i, pos); print_dictionary_block(dictionary_block, DICTIONARY_BLOCK_SIZE); } fseek(stream, 0L, SEEK_END); } #ifdef testing /* dictionary testing code */ static void check_dictionary_entry _AP((char* word,long expected_position, database* db)); static void check_dictionary_entry(word,expected_position,db) char *word; long expected_position; database *db; { if(expected_position != look_up_word_in_dictionary(word, NULL, db)) { waislog(WLOG_HIGH, WLOG_ERROR, "%s should be %ld is %ld in db %s", word, expected_position, look_up_word_in_dictionary(word, NULL, db), db->database_file); } } static void test_dictionary _AP((database* db)); static void test_dictionary(db) database *db; /* this is just an trivial test */ { db->number_of_words = 3; init_dict_file_for_writing(db); add_word_to_dictionary("aardvark", 123L, 0l, db); add_word_to_dictionary("house", 234L, 0L, db); add_word_to_dictionary("mary", 345L, 0L, db); fflush(db->dictionary_stream); print_dictionary(db); check_dictionary_entry("aardvark", 123L, db); check_dictionary_entry("house", 234L, db); check_dictionary_entry("mary", 345L, db); check_dictionary_entry("food", -123L, db); check_dictionary_entry("zebra", -345L, db); check_dictionary_entry("aaarf", 0L, db); } #endif /* def testing */ /*========================* *=== Document Table ===* *========================*/ boolean read_document_table_entry(doc_entry,number,db) document_table_entry* doc_entry; long number; database* db; /* returns a document_table_entry on the stack */ { long position; FILE *stream = db->document_table_stream; position = (DOC_TAB_HEADER_SIZE + ((long)number * (long)DOC_TAB_ELEMENT_SIZE)); if (0 != fseek(stream, position, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the document table to position %ld in db %s", position, db->database_file); return(false); } doc_entry->filename_id = read_bytes(DOC_TAB_ENTRY_FILENAME_ID_SIZE, stream); doc_entry->headline_id = read_bytes(DOC_TAB_ENTRY_HEADLINE_ID_SIZE, stream); doc_entry->start_character = read_bytes(DOC_TAB_ENTRY_START_CHAR_SIZE, stream); doc_entry->end_character = read_bytes(DOC_TAB_ENTRY_END_CHAR_SIZE, stream); doc_entry->document_length = read_bytes(DOC_TAB_ENTRY_DOC_LENGTH_SIZE, stream); doc_entry->number_of_lines = read_bytes(DOC_TAB_ENTRY_NUM_LINES_SIZE, stream); doc_entry->date = read_bytes(DOC_TAB_ENTRY_DATE_SIZE, stream); if (doc_entry->date == EOF) { return(false); } /*printf("read_document_table_entry pos %ld val %lx\n",position,doc_entry->date);*/ return(true); } boolean writeUserValToDocIDTable(userVal,doc,db) unsigned long userVal; long doc; database* db; /* the docIDTable needs to keep a user value for use by other indexing systems. Currently it is stuffed in the date field. This routine needs to be updated if read_document_table_entry changes */ { long position; position = (DOC_TAB_HEADER_SIZE + ((long)doc * (long)DOC_TAB_ELEMENT_SIZE) + DOC_TAB_ENTRY_FILENAME_ID_SIZE + DOC_TAB_ENTRY_HEADLINE_ID_SIZE + DOC_TAB_ENTRY_START_CHAR_SIZE + DOC_TAB_ENTRY_END_CHAR_SIZE + DOC_TAB_ENTRY_DOC_LENGTH_SIZE + DOC_TAB_ENTRY_NUM_LINES_SIZE); if (0 != fseek(db->document_table_stream,position,SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the document table to position %ld in db %s", position,db->database_file); return(false); } /*printf("writeUserValToDocIDTable pos %ld val %lx\n",position,userVal);*/ write_bytes(userVal,DOC_TAB_ENTRY_DATE_SIZE,db->document_table_stream); fflush(db->document_table_stream); return(true); } #ifdef testing static boolean check_document_id _AP((long doc_id,database* db)); static boolean check_document_id(doc_id,db) long doc_id; database* db; /* returns true if that is a valid doc_id (corresponds to a file that has not been deleted */ { long position; FILE *stream = db->document_table_stream; long filename_id; char filename[MAX_FILE_NAME_LEN]; position = (DOC_TAB_HEADER_SIZE + ((long)doc_id * (long)DOC_TAB_ELEMENT_SIZE)); if (0 != fseek(stream, position, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the document table to position %ld in db %s", position, db->database_file); return(false); } filename_id = read_bytes(DOC_TAB_ENTRY_FILENAME_ID_SIZE, stream); /* probe the file. Is there a faster way? */ return(probe_file_possibly_compressed(read_filename_table_entry(filename_id, filename,NULL,db))); } #endif long write_document_table_entry(doc_table_entry, db) document_table_entry* doc_table_entry; database* db; { /* returns the document_id */ s_fseek(db->document_table_stream, (DOC_TAB_HEADER_SIZE + (db->doc_table_allocated_entries * DOC_TAB_ELEMENT_SIZE)), SEEK_SET); /* write the pieces */ write_bytes(doc_table_entry->filename_id, DOC_TAB_ENTRY_FILENAME_ID_SIZE, db->document_table_stream); write_bytes(doc_table_entry->headline_id, DOC_TAB_ENTRY_HEADLINE_ID_SIZE, db->document_table_stream); write_bytes(doc_table_entry->start_character, DOC_TAB_ENTRY_START_CHAR_SIZE, db->document_table_stream); write_bytes(doc_table_entry->end_character, DOC_TAB_ENTRY_END_CHAR_SIZE, db->document_table_stream); write_bytes(doc_table_entry->document_length, DOC_TAB_ENTRY_DOC_LENGTH_SIZE, db->document_table_stream); /* printf("Writing %ld lines\n", document_table_entry->number_of_lines); */ write_bytes(doc_table_entry->number_of_lines, DOC_TAB_ENTRY_NUM_LINES_SIZE, db->document_table_stream); write_bytes(doc_table_entry->date, DOC_TAB_ENTRY_DATE_SIZE, db->document_table_stream); db->doc_table_allocated_entries++; return(db->doc_table_allocated_entries); } long next_document_id(db) database* db; { return(db->doc_table_allocated_entries); } /*========================* *=== Filename table ===* *========================*/ #ifndef MAXPATHLEN /* think_c does not define it for instance */ #define MAXPATHLEN 2000 #endif /* MAXPATHLEN */ static char *read_filename_table_stream _AP((long position, char* filename, char* type, time_t* file_write_date, FILE *stream)); static char *read_filename_table_stream(position,filename,type, file_write_date, stream) long position; char* filename; char* type; time_t* file_write_date; FILE *stream; { /* Returns the filename array after side effecting it, * or NULL if an error. * The type of the file is put in the argument "type". This will * not be longer than MAX_FILE_NAME_LEN. * * if type is NULL then ignore it, * if file_write_date is NULL then ignore it, * If position is -1, then it does not seek. * * Leave the file positioned at the start of the next entry. */ long file_write_date_internal; char type_internal[MAX_TYPE_LEN]; if(NULL == stream) return(NULL); if(NULL == type) /* this means we do not care, so set up a dummy */ type = type_internal; filename[0] = '\0'; /* init to the empty string */ if(NULL != type) type[0] = '\0'; /* init to the empty string */ if(position != -1){ if (0 != fseek(stream, position, SEEK_SET)){ waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the filename index to position %ld", position); return(NULL); } } if(false == read_string_from_file(stream, filename, MAX_FILE_NAME_LEN)){ return(NULL); } else{ file_write_date_internal = read_bytes(FILE_WRITE_DATE_SIZE, stream); if(file_write_date){ *file_write_date = (time_t)file_write_date_internal; } if(false == read_string_from_file(stream, type, MAX_TYPE_LEN)){ return(NULL); } } return(filename); } char *read_filename_table_entry(position,filename,type,file_write_date,db) long position; char* filename; char* type; time_t* file_write_date; database* db; { /* Returns the filename array after side effecting it, * or NULL if an error. * The type of the file is put in the argument "type". This will * not be longer than MAX_FILE_NAME_LEN. * * if type is NULL then ignore it, * if file_write_date is NULL then ignore it, * If position is -1, then it does not seek. * * Leave the file positioned at the start of the next entry. */ FILE *stream = db->filename_table_stream; return(read_filename_table_stream(position,filename,type, file_write_date,stream)); } long write_filename_table_entry(filename,type,db) char* filename; char *type; database* db; { /* writes the filename (NULL terminated), followed by 4 bytes of creation date, followed by the file type (NULL terminated), Returns the postion of the filename */ long free_position,count,i,j; char full_path[MAXPATHLEN]; char savedFileName[MAX_FILENAME_LEN + 1]; char* tmp_type = NULL; /* temporary type */ char* tmp_type_pointer = NULL; /* temporary type pointer */ s_fseek(db->filename_table_stream, 0L, SEEK_END); free_position = ftell(db->filename_table_stream); /* add the filename to the hashtable not done yet XXX (setf (gethash filename *filename_table_hashtable*) (file_write_date filename)) */ fprintf(db->filename_table_stream, "%s", truename(filename, full_path)); fputc(0, db->filename_table_stream); if(FILE_WRITE_DATE_SIZE != sizeof(time_t)){ /* check if these are the same */ panic("We have a problem with the file_write_date_size\n"); } write_bytes((long)file_write_date(filename), FILE_WRITE_DATE_SIZE, db->filename_table_stream); /* fwrite(type, sizeof(char), strlen(type) + 1, db->filename_table_stream);*/ /* francois - multitype extensions */ /* Here we just add the document types to the file entry, we need to check to see if each file is there so we probe them. */ if ( strstr(type,",") == NULL ) { fprintf(db->filename_table_stream, "%s",type); fputc(0,db->filename_table_stream); } else { /* count up the number of document types */ count = 1L; #ifdef WIN32 for (i = 0L; i < (long)strlen(type); i++){ #else for (i = 0L; i < strlen(type); i++){ #endif if ( type[i] == ',' ) count++; } /* duplicate the type and save the pointer */ tmp_type = s_strdup(type); tmp_type_pointer = tmp_type; /* append types - NULL out the pointer so that strtok can grab the subsequent entries */ for (i = 0L; i < count; i++ ) { tmp_type_pointer = s_strdup(strtok(tmp_type_pointer,",")); strcpy(savedFileName,filename); if ( strcmp(savedFileName+(strlen(savedFileName)-2), ".Z") == 0 ) { /* it's a .Z file. First, remove the suffix or many things get confused. */ savedFileName[(strlen(savedFileName)-2)] = 0; } /* strip the current extension, but not the period */ for ( j = strlen(savedFileName); j >= 0L; j-- ) { if (savedFileName[j] == '.') { savedFileName[j+1] = 0; break; } } /* append the type to the file name */ strcat(savedFileName,tmp_type_pointer); if(probe_file_possibly_compressed(savedFileName)) { fprintf(db->filename_table_stream, "%s",tmp_type_pointer); fprintf(db->filename_table_stream, ","); } s_free(tmp_type_pointer); tmp_type_pointer = NULL; } /* release the tmp_type allocations */ s_free(tmp_type); /* terminate the string */ fputc(0,db->filename_table_stream); } return(free_position); } /* functions to figure out if the file is in the index already */ static boolean filename_in_filename_stream _AP((char *filename, char *type, time_t *file_write_date, FILE *stream)); static boolean filename_in_filename_stream(filename, type, file_write_date, stream) char *filename; char *type; time_t *file_write_date; FILE *stream; /* returns true if it is there (and side effects type and file_write_date). leaves the stream at the end of the file. If type or file_write_date is NULL, then it is a dont care. type, if it is an array, should be MAX_FILENAME_LEN long at least. */ { /* this is slow because it loops through the whole file every time. this might want to be optimized by making a hashtable. */ char next_filename[MAX_FILENAME_LEN]; s_fseek(stream, FILENAME_TABLE_HEADER_SIZE, SEEK_SET); while(!feof(stream)){ char new_type[MAX_FILENAME_LEN]; if(NULL == read_filename_table_stream(-1, next_filename, new_type, file_write_date, stream)) return(false); #ifdef WIN32 if(0 == _stricmp(next_filename, filename)) #else if(0 == strcmp(next_filename, filename)) #endif return(true); } } boolean filename_in_database(filename,type,file_write_date,db) char *filename; char *type; time_t *file_write_date; database *db; { return(filename_in_filename_stream(filename, type, file_write_date, db->filename_table_stream)); } /* this caches the last filename that was found to be in the filename file, this way repeated attempts to figure out if a file is there will be fast. This is the case when retrieving successive blocks of a file. */ char last_filename_found_in_file[MAX_FILE_NAME_LEN]; char last_filename_file[MAX_FILE_NAME_LEN]; boolean filename_in_filename_file(filename,type,file_write_date, filename_file) char *filename; char *type; time_t *file_write_date; char *filename_file; { if(NULL == filename) return(false); if(0 == strcmp(last_filename_found_in_file, filename) && 0 == strcmp(last_filename_file, filename_file)) return(true); else #ifdef WIN32 { FILE *stream = s_fopen(filename_file, "rb"); #else { FILE *stream = s_fopen(filename_file, "r"); #endif boolean answer; if(NULL == stream) { s_fclose(stream); return(false); } answer = filename_in_filename_stream(filename,type,file_write_date, stream); if(answer == true) { /* record it in the cache */ strncpy(last_filename_file, filename_file, MAX_FILE_NAME_LEN); strncpy(last_filename_found_in_file, filename, MAX_FILE_NAME_LEN); } s_fclose(stream); return(answer); } } /*========================* *=== Headline Table ===* *========================*/ char *read_headline_table_entry(position,db) long position; database* db; /* returns the headline array after side effecting it. Beware that * the next call to this function will overwrite the the headline_array */ { /* this is the headline that gets returned */ static char headline_array[MAX_HEADLINE_LEN]; FILE *stream = db->headline_table_stream; headline_array[0] = '\0'; /* init to the empty string */ if (0 != fseek(stream, position, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the headline index to position %ld in db %s", position, db->database_file); return(headline_array); } if(false == read_string_from_file(db->headline_table_stream, headline_array, MAX_FILE_NAME_LEN)){ waislog(WLOG_HIGH, WLOG_ERROR, "headline table is corrupt at %ld in db %s", position, db->database_file); } return(headline_array); } /* writes the string to the file followed by a NULL. * The returned number is the position in the file to start reading. */ long write_headline_table_entry(headline,db) char* headline; database* db; { /* writes the headline followed by a newline. Returns the postion of the headline. */ long free_position; s_fseek(db->headline_table_stream, 0L, SEEK_END); free_position = ftell(db->headline_table_stream); /* printf("Headline position: %ld, next headline length: %ld\n", free_position, strlen(headline)); */ fprintf(db->headline_table_stream, "%s", headline); fputc(0, db->headline_table_stream); return(free_position); } #ifdef BIO /*========================* *=== delimiters - dgg ===* *========================*/ char *read_delimiters(db) database* db; /* returns the word delimiters for a database. Beware that * the next call to this function will overwrite the the headline_array */ { static char delimiters[MAX_HEADLINE_LEN+1]; FILE *stream = db->delimiters_stream; delimiters[0] = '\0'; /* init to the empty string */ if(false == read_string_from_file(db->delimiters_stream, delimiters, MAX_HEADLINE_LEN)){ waislog(WLOG_HIGH, WLOG_ERROR, "delimiters are corrupt in db %s", db->database_file); } /* need to weed out .dlm files that have no symbols... */ if (delimiters[0] == '\0') return(NULL); else return(delimiters); } /* writes the string to the file followed by a NULL. * The returned number is the position in the file to start reading. */ long write_delimiters(delimiters,db) char* delimiters; database* db; { /* writes the headline followed by a newline. Returns the postion of the headline. */ long free_position; s_fseek(db->delimiters_stream, 0L, SEEK_SET); /* _SET, only one set of delims / file ? */ free_position = ftell(db->delimiters_stream); fprintf(db->delimiters_stream, "%s", delimiters); fputc(0, db->delimiters_stream); return(free_position); } #endif /* =================== */ /* === Source file === */ /* =================== */ /* the source file is an ascii file for describing a source. it is defined in ../doc/source.txt */ /* Registers the src structure with the directory of servers. Return true if successful */ boolean register_src_structure(filename) char *filename; { #ifndef WIN32 char string[200], *editor; long answer; #endif #ifdef WIN32 printf("\n"); printf("Please look over the source description in %s\n",filename); printf("Be sure it contains an IP address and DNS name, as well as\n"); printf("the port you intend to use for the WAIS server.\n\n"); printf("This program does not automatically register sources with the directory\n"); printf("of servers. You must mail the %s file manually\n",filename); printf("to the following addresses:\n"); printf(" wais-directory-of-servers@cnidr.org\n"); printf(" wais-directory-of-servers@quake.think.com\n"); return true; #else if((editor = (char*)getenv("EDITOR")) == NULL && (editor = (char*)getenv("VISUAL")) == NULL) { printf("Could not get EDITOR environment variable.\n"); printf("Please check over the source structure: %s\n", filename); printf("Then mail it to wais-directory-of-servers@cnidr.org\n"); return (false); } /* register the server with the directory of servers */ printf("Please look over the Source description. Be sure it contains\n"); printf("an IP address and DNS name, as well as the port you intend\n"); printf("to use for the server.\n"); printf("When you are finished it will be mailed to the directory of servers.\n"); fflush(stdout); sprintf(string, "exec %s %s", editor, filename); system(string); printf("\nSending source structure to the CNIDR directory of servers..."); sprintf(string, "cat %s | mail wais-directory-of-servers@cnidr.org %s\n", filename, getenv("USER")); answer = system(string); printf("\nSending source structure to the TM directory of servers..."); sprintf(string, "cat %s | mail wais-directory-of-servers@quake.think.com %s\n", filename, getenv("USER")); answer = system(string); printf("Done.\n"); return((answer == 0)?true:false); #endif } /* Writes a source structure to a file. If the export_database arg is set, then the tcp_port is used in the tcp-port slot. Returns true if successful. */ boolean write_src_structure(filename, database_name, typename, filenames, count, export_database, tcp_port) char *filename; char *database_name; char *typename; char **filenames; long count; boolean export_database; long tcp_port; { long i,j; char hostname[120]; struct hostent *h; #ifndef THINK_C #ifndef M_XENIX FILE *source_stream = s_fopen(filename, "w"); fprintf(source_stream, "\n\n(:source \n"); fprintf(source_stream, " :version 3 \n"); if(export_database){ #ifdef WIN32 (void)InitSockets(); hostname[0] = '\0'; #endif mygethostname(hostname, 120); h = gethostbyname(hostname); #ifdef WIN32 (void)TermSockets(); #endif if (h != NULL && h->h_addr_list != NULL && h->h_addr_list[0] != NULL) { fprintf(source_stream, " :ip-address \"%d.%d.%d.%d\"\n", (unsigned char)h->h_addr_list[0][0], (unsigned char)h->h_addr_list[0][1], (unsigned char)h->h_addr_list[0][2], (unsigned char)h->h_addr_list[0][3] ); } fprintf(source_stream, " :ip-name \"%s\"\n", hostname ); fprintf(source_stream, " :tcp-port %ld\n", tcp_port); } fprintf(source_stream, " :database-name \"%s\"\n", database_name); fprintf(source_stream, " :cost 0.00 \n"); fprintf(source_stream, " :cost-unit :free \n"); fprintf(source_stream, " :maintainer \"%s\"\n", current_user_name()); fprintf(source_stream, " :keyword-list (\n"); for (j=0; j< nKeys; j++) { fprintf(source_stream, " %s\n", keyword[j]); } fprintf(source_stream, " )\n"); if(!nDesLines){ fprintf(source_stream, " :description \"Server created with %s on %s by %s\n", VERSION, printable_time(), current_user_name()); if(count > 0){ #ifdef sgi fprintf(source_stream, "Files of type %s were used in the index.\n", typename); #else fprintf(source_stream, "The files of type %s used in the index were:\n", typename); for(i = 0; i < count; i++){ char full_path[MAX_FILENAME_LEN + 1]; fprintf(source_stream, " %s\n", truename(filenames[i], full_path)); } #endif } fprintf(source_stream, "\"\n"); } else for (j=0; j<nDesLines; j++) fprintf(source_stream, "%s", descript[j]); fprintf(source_stream, ")\n"); s_fclose(source_stream); #endif /* ndef M_XENIX */ #endif /* ndef THINK_C */ return(true); } boolean build_catalog(db) database* db; { char catalog_name[MAX_FILENAME_LEN]; document_table_entry doc_entry; char filename[MAX_FILE_NAME_LEN], type[100]; FILE *catalog; long i; sprintf(catalog_name,"%s%s",db->database_file, catalog_ext); if((catalog = s_fopen(catalog_name, "w")) == NULL) { waislog(WLOG_HIGH, WLOG_ERROR, "Unable to open catalog file for write: %s.", catalog_name); return(false); } fprintf(catalog, "Catalog for database: %s\n", db->database_file); fprintf(catalog, "Date: %s\n", printable_time()); /* the first document is empty - JG */ fprintf(catalog, "%ld total document%s\n\n", db->doc_table_allocated_entries-1, (db->doc_table_allocated_entries==2) ? "":"s"); for(i=1; i<db->doc_table_allocated_entries; i++) { /* fprintf(catalog, "Document # %ld\n", i); */ if (read_document_table_entry(&doc_entry, i, db) == true){ char *hl; long hll; read_filename_table_entry(doc_entry.filename_id, filename, type, NULL, db); fprintf(catalog, "Document # %ld Type: %s\n", i,type); hl = read_headline_table_entry(doc_entry.headline_id,db); hll = strlen(hl); fprintf(catalog, "Headline: %s", hl); if((hll== 0) || (hl[hll-1] != '\n')) fprintf(catalog,"\n"); fprintf(catalog, "DocID: %d %d %s\n\n", doc_entry.start_character, doc_entry.end_character, filename); } else { fprintf(catalog, "Unable to read document table for document %n!\n\n", i); } } s_fclose(catalog); return(true); } /*****************************/ /*** Database support ***/ /*****************************/ char* dictionary_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,dictionary_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } /* for use in building so that the real one does not get overstomped */ static char* temp_dictionary_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,dictionary_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); s_strncat(destination,"tmp",MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } char* document_table_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,document_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } char* filename_table_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,filename_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } char* headline_table_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,headline_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } #ifdef BIO char* delimiters_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,delimiters_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } #endif char* index_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,index_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } /* this is used during index creation. if the version is -2, then it means the real index_filename. This is a kludge */ char* index_filename_with_version(version,destination,db) long version; char* destination; database* db; { if(version == -2L){ return(index_filename(destination, db)); } else{ sprintf(destination, "%s%s%ld", db->database_file, index_ext, version); return(destination); } } char* source_filename(destination,db) char* destination; database* db; { strncpy(destination, db->database_file,MAX_FILE_NAME_LEN); s_strncat(destination,source_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN); return(destination); } char* get_doc(destination, document_id, db, headline) char* destination; long document_id; database* db; boolean headline; { document_table_entry doc_entry; char filename[MAX_FILE_NAME_LEN], type[100]; char *hl; if (read_document_table_entry(&doc_entry, document_id, db) == true){ read_filename_table_entry(doc_entry.filename_id, filename, type, NULL, db); /* francois - multitype extension */ if ( strstr(type,",") != NULL ) { type[strstr(type,",") - type] = '\0'; } if (headline == TRUE) { hl = read_headline_table_entry(doc_entry.headline_id,db); sprintf(destination, "%d %d %s, \"%s\"", doc_entry.start_character, doc_entry.end_character, filename, hl); } else sprintf(destination, "%d %d %s", doc_entry.start_character, doc_entry.end_character, filename); return(s_strdup(type)); } else return NULL; } long next_doc(destination, docID, db) char* destination; char* docID; database* db; { long i, start, end; char doc[MAX_FILE_NAME_LEN+50], fn[MAX_FILE_NAME_LEN]; char *type, *loc; for(i = 0; i < db->doc_table_allocated_entries; i++) { if ((type = get_doc(doc, i, db, FALSE)) != NULL) { s_free(type); if (strcmp(doc, docID) == 0) { type = get_doc(doc, i+1, db, TRUE); sscanf(doc, "%d %d %s", &start, &end, fn); if((loc = strstr(doc, ",")) == NULL) return -1; fn[loc-doc] = 0; sprintf(destination, "%s, %s", doc, type); s_free(type); if( end != 0) return(end-start); else { /* whole file, find file length from the file */ long size; FILE* file = NULL; if (((file = s_fopen(fn, "r")) != NULL) && (fseek(file, 0L, SEEK_END) == 0) && ((size = ftell(file)) != -1)) { s_fclose(file); return(size); /* we are done, bytes is set */ } else { s_fclose(file); return(-1); /* something went wrong with the file */ } } } } } return -1; } long previous_doc(destination, docID, db) char* destination; char* docID; database* db; { long i, start, end; char doc[MAX_FILE_NAME_LEN+50], fn[MAX_FILE_NAME_LEN]; char *type, *loc; for(i = 0; i < db->doc_table_allocated_entries; i++) { if ((type = get_doc(doc, i, db, FALSE)) != NULL) { s_free(type); if (strcmp(doc, docID) == 0) { if (i != 0) { type = get_doc(doc, i-1, db, TRUE); sscanf(doc, "%d %d %s", &start, &end, fn); if((loc = strstr(doc, ",")) == NULL) return -1; fn[loc-doc] = 0; sprintf(destination, "%s, %s", doc, type); s_free(type); if( end != 0) return(end-start); else { /* whole file, find file length from the file */ long size; FILE* file = NULL; if (((file = s_fopen(fn, "r")) != NULL) && (fseek(file, 0L, SEEK_END) == 0) && ((size = ftell(file)) != -1)) { s_fclose(file); return(size); /* we are done, bytes is set */ } else { s_fclose(file); return(-1); /* something went wrong with the file */ } } } } } } return(-1); } long next_docid(docID, db) char* docID; database* db; { long i; char doc[MAX_FILE_NAME_LEN+50]; for(i = 0; i < db->doc_table_allocated_entries; i++) { if (get_doc(doc, i, db, FALSE) != NULL) { if (strcmp(doc, docID) == 0) { return (i+1); } } } return -1; } long previous_docid(docID, db) char* docID; database* db; { long i; char doc[MAX_FILE_NAME_LEN+50]; for(i = 0; i < db->doc_table_allocated_entries; i++) { if (get_doc(doc, i, db, FALSE) != NULL) { if (strcmp(doc, docID) == 0) { return (i-1); } } } return -1; }